##########
# Crossing Borders: STATPOP cleaning
##########

library(here)
library(data.table)

source(here("code", "resources", "country_codes.R")) # get country codes for Petra/Statpop data

# Prefix each code in `vec` with "8" and return the result as numeric,
# e.g. 207 -> 8207.
#
# An empty (or NULL) input returns an empty numeric vector. Without the guard,
# paste0("8", character(0)) evaluates to "8", so an empty code group would
# silently turn into the single code 8.
paste8 <- function(vec) {
  if (length(vec) == 0L) {
    return(numeric(0))
  }
  as.numeric(paste0("8", vec))
}

# Aggregate one year of STATPOP records into counts per municipality.
#
# Arguments:
#   year  Year of the STATPOP extract. Selects the input file
#         data/cleaning/STATPOP/STATPOP_<year>.dta and the crosswalk rows
#         used to map `reportingmunicipalityid` to `bfs19`.
#
# Returns a data.table with one row per `bfs19` value, one integer count
# column per nationality group, `all_imm` (number of records in the group)
# and `year`. Every `bfs19` in the crosswalk that does not appear in the
# aggregated data is appended with zero counts.
#
# NOTE(review): this function relies on objects that are not defined in this
# file: `read_dta` (haven; presumably attached by the calling script), the
# crosswalk `cw` (columns `bfs`, `bfs19` and, judging by the year filter,
# `year`), the helper `setdiff_dw`, and the country-code vectors sourced from
# country_codes.R. Confirm they are available before calling.
muni_organize_STATPOP <- function(year){
  
  dat <- read_dta(here("data", "cleaning", "STATPOP", paste0("STATPOP_", year, ".dta")))
  setDT(dat)
  
  nationality <- dat$nationalitystate
  
  # Flag column name -> country-code vector. paste8() prepends the "8" that
  # the nationality codes in STATPOP carry.
  code_groups <- list(
    west_eu = Western_EU,
    south_eu = South_EU,
    ce_eu = CentralEast_EU,
    former_yugo = Former_Yugo,
    turkey = Turkey,
    germany = Germany,
    france = France,
    austria = Austria,
    italy = Italy,
    liechtenstein = Liechtenstein,
    border = Border
  )
  for (flag in names(code_groups)) {
    set(dat, j = flag, value = nationality %in% paste8(code_groups[[flag]]))
  }
  set(dat, j = "other", value = nationality >= 8300 | nationality < 0)
  
  # STATPOP appears to use the municipality vintage of its own year, so the
  # crosswalk must be restricted to that year. The comparison needs a local
  # with a different name: inside cw[...], `year == year` compares the column
  # (or the argument) with itself and never filters anything.
  statpop_year <- year
  cw_year <- cw[year == statpop_year]
  if (nrow(cw_year) == 0L) {
    warning("No crosswalk rows found for year ", statpop_year,
            "; all bfs19 values will be NA.", call. = FALSE)
  }
  # `i.bfs19` takes the value from the crosswalk explicitly, even if `dat`
  # already carries a column named bfs19.
  dat[cw_year, on = c(reportingmunicipalityid = "bfs"), bfs19 := i.bfs19]
  
  # Output column order (after bfs19) is fixed here.
  count_cols <- c("west_eu", "south_eu", "ce_eu", "former_yugo", "turkey",
                  "other", "germany", "france", "italy", "austria",
                  "liechtenstein", "border")
  
  # Records whose municipality has no crosswalk match end up in a bfs19 = NA row.
  muni_dat <- dat[
    , c(lapply(.SD, sum, na.rm = TRUE), list(all_imm = .N)),
    by = bfs19,
    .SDcols = count_cols]
  
  # Crosswalk municipalities without any record in this year's extract.
  missing <- setdiff_dw(muni_dat$bfs19, unique(cw$bfs19))$in_y_not_x
  
  if(length(missing) > 0){
    message("The following ", length(missing), " municipalities are not found in STATPOP in ", year,". Imputing them as having 0 immigrants: ", paste0(missing, collapse = ", "))
    
    # All-zero integer rows with the same columns as muni_dat; bfs19 is then
    # overwritten with the missing municipality codes.
    replacement <- data.table(matrix(0L, nrow = length(missing), ncol = ncol(muni_dat)))
    setnames(replacement, colnames(muni_dat))
    set(replacement, j = "bfs19", value = missing)
    muni_dat <- rbind(muni_dat, replacement)
  }
  
  set(muni_dat, j = "year", value = year)
  
  muni_dat
}

# Stack the yearly municipality aggregates for 2010-2014 into one table.
statpop <- rbindlist(
  lapply(2010L:2014L, function(yr) muni_organize_STATPOP(yr))
)

# Tell the user which object this script leaves behind.
message(
  "You've created the following data.table:
  * statpop: contains information on immigrants per municipality based on origin."
)

# Drop the country-code vectors and the helper functions from the workspace.
rm(list = c(
  "Austria", "Border", "CentralEast_EU", "Former_Yugo", "France", "Germany",
  "Italy", "Liechtenstein", "South_EU", "Turkey", "Western_EU",
  "muni_organize_STATPOP", "paste8"
))
